import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import print_function
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from datetime import datetime
import os

%matplotlib inline
%config InlineBackend.figure_format = 'png'

# Load the 2013 training data, keep only rows that are bookings, and draw
# 50,000 rows (np.random.choice samples with replacement by default).
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
train = train[train["is_booking"] == 1]
# The index is no longer contiguous after filtering, so the sampled values are
# labels: use the label-based .loc (DataFrame.ix was removed in pandas 1.0).
train = train.loc[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)

print('preprocessing train_data')
use_col = ["srch_co","srch_ci","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt","hotel_cluster"]

# Target: the hotel cluster, kept as a one-column DataFrame.
train_y = train[["hotel_cluster"]]

# Work on an explicit copy so the column assignments below do not write into
# a slice of `train` (avoids SettingWithCopyWarning / silent no-ops).
train_x = train[use_col].copy()
train_x["srch_ci"] = pd.to_datetime(train_x["srch_ci"], errors="coerce")
train_x["srch_co"] = pd.to_datetime(train_x["srch_co"], errors="coerce")

# Length of stay in whole days. Dates that failed to parse become NaT, which
# would make astype(int) raise, so they are mapped to a 0-day stay first.
stay = train_x["srch_co"] - train_x["srch_ci"]
train_x["period"] = (stay / np.timedelta64(1, 'D')).fillna(0.0).astype(int)

# Cap the adult count at 3 (3 or more adults share one bucket).
train_x["srch_adults_cnt"] = train_x["srch_adults_cnt"].clip(upper=3)

# Final feature set; the raw date columns, srch_children_cnt and the target
# are dropped by this selection.
train_x = train_x[["srch_destination_id","hotel_country","srch_adults_cnt","period"]]

preprocessing train_data
srch_destination_id hotel_country srch_adults_cnt period
0 12696 8 1 1
1 12189 50 2 4
2 2758 31 1 8
3 8267 50 3 2
4 18741 50 1 3

array([15, 72, 58, 56, 42,  0, 96, 18, 95, 91, 11, 43, 12, 46, 26,  5,  2,
       16, 70, 33, 21,  4, 51, 40, 20, 89, 48, 82,  1, 79,  8, 50, 97, 47,
       63, 55, 61, 77,  6, 59,  3, 49, 64, 41, 13, 94, 98, 19, 28, 14, 37,
       62, 10, 36, 74, 80, 44, 31, 32, 29, 45,  7, 87, 99, 35, 68, 57,  9,
       81, 86, 60, 30, 52, 39, 75, 83, 78, 65, 25, 88, 90, 69, 71, 76, 23,
       66, 67, 54, 53, 92, 17, 85, 24, 22, 84, 38, 73, 34, 93, 27], dtype=int64)

# srch_destination_id = ex) 도쿄라고 했을 때 나오는 그룹 => hotel_cluster. 고로 srch_destination_id와 hotel_country는 유사할 것임

sum count
srch_destination_id hotel_cluster
2 20 1 1
4 67 1 1
78 1 1
81 1 1
8 7 1 1
32 1 1
42 1 1
48 1 1
76 1 1
11 91 1 1
14 20 1 1
61 1 1
16 15 1 1
85 1 1
19 64 1 1
21 62 1 1
67 3 3
82 1 1
89 1 1
24 3 1 1
23 1 1
42 1 1
47 2 2
60 1 1
76 2 2
91 3 3
25 5 1 1
10 1 1
13 2 2
32 1 1
... ... ... ...
60988 41 1 1
68 1 1
61097 28 1 1
72 1 1
61102 95 1 1
61128 12 1 1
61193 30 1 1
36 2 2
61306 60 1 1
61413 29 1 1
62 1 1
61418 58 1 1
61442 5 1 1
61528 32 1 1
49 1 1
72 1 1
61531 10 1 1
61533 11 1 1
41 2 2
83 1 1
61702 33 1 1
47 1 1
48 1 1
91 2 2
61756 56 2 2
72 1 1
77 1 1
62487 6 1 1
62508 32 1 1
62824 21 1 1

22857 rows × 2 columns

이게 baseline. 앞으로는 남들과 다르게 예약한 사람을 찾아서 그들을 지켜보자.

# Reload the 2013 training data and draw 50,000 rows (np.random.choice samples
# with replacement by default).
# NOTE(review): no is_booking filter is applied in this pass - confirm intended.
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
# Label-based .loc replaces DataFrame.ix, which was removed in pandas 1.0.
train = train.loc[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)

print('preprocessing train_data')
# NOTE(review): this list literal was cut off after "user_location_region";
# the remaining names are reconstructed from the columns read further down
# (hotel_market .. srch_children_cnt) - confirm against the original notebook.
use_col = ["srch_co","srch_ci","user_location_region",\
           "hotel_market","srch_destination_id","hotel_country",\
           "srch_adults_cnt","srch_children_cnt"]

# Target: the hotel cluster, kept as a one-column DataFrame.
train_y = train[["hotel_cluster"]]

# Explicit copy so the column assignments below do not write into a slice of
# `train`.
train_x = train[use_col].copy()
train_x["srch_ci"] = pd.to_datetime(train_x["srch_ci"], errors="coerce")
train_x["srch_co"] = pd.to_datetime(train_x["srch_co"], errors="coerce")

# Length of stay in whole days; unparseable dates (NaT) count as a 0-day stay.
stay = train_x["srch_co"] - train_x["srch_ci"]
train_x["period"] = (stay / np.timedelta64(1, 'D')).fillna(0.0).astype(int)

# Cap the adult count at 3 (3 or more adults share one bucket).
train_x["srch_adults_cnt"] = train_x["srch_adults_cnt"].clip(upper=3)

# Final feature set, in the column order the model is trained on; the raw
# date columns and srch_children_cnt are dropped by this selection.
train_x = train_x[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]

# NOTE(review): this list literal was cut off after "user_location_region";
# the remaining names are reconstructed from the columns read further down -
# confirm against the original notebook.
use_col = ["srch_co","srch_ci","user_location_region",\
           "hotel_market","srch_destination_id","hotel_country",\
           "srch_adults_cnt","srch_children_cnt"]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
# Explicit copy so the column assignments below do not write into a slice.
test = test[use_col].copy()

print("preprocessing test_data")

test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
# Length of stay in whole days; unparseable dates (NaT) count as a 0-day stay.
stay = test["srch_co"] - test["srch_ci"]
test["period"] = (stay / np.timedelta64(1, 'D')).fillna(0.0).astype(int)
# Cap the adult count at 3 (3 or more adults share one bucket).
test["srch_adults_cnt"] = test["srch_adults_cnt"].clip(upper=3)

# Same features, in the same order, as the train_x frame the model is fitted on.
test = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]

print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('# Test shape : {}'.format(test.shape))
# The model must be fitted before predict_proba; train_x / train_y come from
# the preceding training cell. ravel() turns the one-column target into the
# 1-D array scikit-learn expects.
model.fit(train_x, np.ravel(train_y))

preds = model.predict_proba(test)
# Rank classes per row from most to least probable. argsort yields column
# positions, which are mapped through model.classes_ to get the real
# hotel_cluster labels (the two only coincide when every label 0..N-1 is
# present in the training sample).
preds = model.classes_[np.fliplr(np.argsort(preds, axis=1))]

print("save file")

# One row per test record: the five most probable clusters, space-separated.
result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
# Timestamped file name so repeated runs do not overwrite each other.
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)

preprocessing train_data
